# Sarima
suppressMessages(library(astsa))
suppressMessages(library(sarima))
# Arima forecast
suppressMessages(library(forecast))
# Skewness Kurtosis
suppressMessages(library(moments))
# Seasonality test
suppressMessages(library(seastests))
# Plotly
suppressMessages(library(plotly))
# Anomalies
suppressMessages(library(anomalize))
# Time series Manipulation
suppressMessages(library(xts))
# Plots
suppressMessages(library(ggplot2))
# A set of packages 
suppressMessages(library(tidyverse))
# Converting everything to everything
suppressMessages(library(tsbox))
#  'ggpubr' provides easy-to-use functions for creating and customizing 'ggplot2'- based publication ready plots
suppressMessages(library(ggpubr))
# This package has functions that plot decomposed time series data with ggplot2 
suppressMessages(library(ggplottimeseries))
# Publication ready theme
theme_set(theme_pubr())
# tables
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))
# Missing data mapping
suppressMessages(library(naniar))
suppressMessages(library(pander))

Abstract

The aim of this project was to construct a SARIMA time series model based on the time series of industrial production of Austria, obtained from the http://datamarket.com website. After exploratory data analysis and appropriate transformations, five candidate models were estimated. The best model was selected using the AIC criterion and diagnostic checking. Once the SARIMA (2,1,0)(0,1,1) model was constructed and fitted, it was utilized for forecasting and simulation.

Data

# Data import ----
# Read the quarterly industrial production series from the CSV file.
data_csv <- read.csv(file = 'austria-qu.csv')

# Visualise the missing-data pattern before any modelling (naniar).
vis_miss(data_csv)

# Build a quarterly ts object; the series starts in Q4 of 1975.
data_ts <- ts(data_csv[[2]], start = c(1975, 4), frequency = 4)

# Conversion to a time-indexed tibble (columns: time, value).
data_tibble <- ts_tibbletime(data_ts)
## Loading required namespace: tibbletime
# Extend the tibble with calendar columns derived from the time index.
data_tibble$month <- factor(format(data_tibble$time, "%b"))
data_tibble$year <- factor(format(data_tibble$time, "%Y"))
# FIX: the original mapped months to quarters by relabelling the
# alphabetically-ordered month-factor levels ("Apr","Jan","Jul","Oct" ->
# "Q2","Q1","Q3","Q4"), which silently produces wrong quarters in any
# non-English locale where "%b" yields different labels. quarters()
# derives "Q1".."Q4" directly from the date and is locale-independent.
data_tibble$quarter <- factor(quarters(data_tibble$time))

# Render the raw quarterly series as a year x quarter table.
data_ts %>% pander()
  Q1 Q2 Q3 Q4
1975 NA NA NA 54.1
1976 59.5 56.5 63.9 57.8
1977 62 58.5 65 59.6
1978 63.6 60.4 66.3 60.6
1979 66.8 63.2 71 66.5
1980 72 67.8 75.6 69.2
1981 74.1 70.7 77.8 72.3
1982 78.1 72.4 82.6 72.9
1983 79.5 72.6 82.8 76
1984 85.1 80.5 89.1 84.8
1985 94.2 89.5 99.3 93.1
1986 103.5 96.4 107.2 101.7
1987 109.5 101.3 112.6 105.5
1988 115.4 108 129.9 112.4
1989 123.6 114.9 131 122.6
1990 131.9 120.5 130.7 115.7
1991 119.7 109.7 125.1 NA
# Show the first rows of the tibble representation.
data_tibble %>% head() %>% pander()
time value month year quarter
1975-10-01 54.1 Oct 1975 Q4
1976-01-01 59.5 Jan 1976 Q1
1976-04-01 56.5 Apr 1976 Q2
1976-07-01 63.9 Jul 1976 Q3
1976-10-01 57.8 Oct 1976 Q4
1977-01-01 62 Jan 1977 Q1

Exploratory Data Analysis

Observations and Trend

# A centred moving average of order 4 (one full year) extracts the trend.
trend <- ma(data_ts, order = 4, centre = TRUE)
trend_ts <- as.ts(trend)
trend_tibble <- ts_tibbletime(trend_ts)

# Observed series.
p1 <- ggplot(data_tibble, aes(x = time, y = value)) +
  geom_line(color = 'darkblue', size = 1) +
  theme_gray() +
  ggtitle('Time Series') +
  xlab('Time') +
  ylab('Observed Values')
fig1 <- ggplotly(p1)
fig1

# Smoothed trend component.
p2 <- ggplot(trend_tibble, aes(x = time, y = value)) +
  geom_line(color = 'darkblue', size = 1) +
  theme_gray() +
  ggtitle('Trend') +
  xlab('Time') +
  ylab('Trend')
fig2 <- ggplotly(p2)
fig2

Anomalize

# Assemble a date-indexed tibble for anomalize: x = quarter start date,
# y = industrial production index.
x <- as.Date(as.yearqtr(data_csv$Year))
y <- data_csv$Production
tbl <- tibble(x, y)

# Decompose the series and flag remainder values as anomalous at
# alpha = 0.05, allowing at most 20% of points to be anomalies.
p1 <- tbl %>%
  time_decompose(y) %>%
  anomalize(remainder, max_anoms = 0.2, alpha = 0.05) %>%
  plot_anomaly_decomposition() +
  ggtitle("Anomaly Decomposition")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = x
## frequency = 4 quarters
## trend = 17 quarters
# Recompose the series and plot it with the anomaly band highlighted.
p2 <- tbl %>%
  time_decompose(y) %>%
  anomalize(remainder, max_anoms = 0.2, alpha = 0.05) %>%
  time_recompose() %>%
  plot_anomalies(time_recomposed = TRUE) +
  geom_line(color = 'darkblue') +
  ggtitle("Anomalies")
## Converting from tbl_df to tbl_time.
## Auto-index message: index = x
## frequency = 4 quarters
## trend = 17 quarters
# Keep only the observations actually flagged as anomalous.
p3 <- tbl %>%
  time_decompose(y) %>%
  anomalize(remainder, max_anoms = 0.2, alpha = 0.05) %>%
  time_recompose() %>%
  filter(anomaly == 'Yes')
## Converting from tbl_df to tbl_time.
## Auto-index message: index = x
## frequency = 4 quarters
## trend = 17 quarters
p1

p2

# Table of the detected anomalies.
# FIX: corrected the misspelled column names 'observerd' -> 'observed'
# and 'seson' -> 'season' in the rendered table.
p4 <- tibble(x = p3$x,
             observed = p3$observed,
             season = p3$season,
             trend = p3$trend,
             remainder = p3$remainder,
             anomaly = p3$anomaly)
p4 %>% pander()
x observed season trend remainder anomaly
1988-07-01 129.9 4.127 114.2 11.55 Yes
1989-07-01 131 4.127 117.3 9.524 Yes
1989-10-01 122.6 -3.214 117.5 8.331 Yes
1990-01-01 131.9 2.459 117.9 11.58 Yes
1990-07-01 130.7 4.127 118.6 7.956 Yes

Seasonality

The WO-test gives out a TRUE if the series is seasonal and FALSE otherwise.

# Webel-Ollech (WO) combined seasonality test: TRUE means seasonal.
summary(wo(data_ts))
## Test used:  WO 
##  
## Test statistic:  1 
## P-value:  0 0.01518954 5.843575e-05 
##  
## The WO - test identifies seasonality
# Convert the ts object into a data frame of decomposed components
# (observed, trend, seasonal, remainder).
df <- dts2(data_ts, type = "additive")
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Draw all decomposed components in one stacked figure.
p <- ggdecompose(df) +
  theme_gray() +
  labs(x = "Time", y = "Index")
p
## Warning: Removed 2 row(s) containing missing values (geom_path).

# Seasonal plot: one line per year across the four quarters.
p <- ggseasonplot(data_ts) +
  theme_gray() +
  ggtitle("Seasonal Plot") +
  ylab("Index")
p

# Subseries plot: the series split by quarter, with per-quarter means.
p <- ggsubseriesplot(data_ts) +
  theme_gray() +
  ggtitle("Seasonal subseries plot") +
  ylab("Index")
p

Box Plot

# Box plot per quarter (ggplot version, quarters forced into calendar order).
quarter_levels <- c('Q1', 'Q2', 'Q3', 'Q4')
p <- ggplot(data_tibble,
            aes(x = factor(quarter, levels = quarter_levels, ordered = TRUE),
                y = value)) +
  geom_boxplot() +
  theme_gray() +
  labs(title = 'BoxPlot', x = 'Season', y = 'Index')
fig <- ggplotly(p)
fig
# Base-graphics box plot, kept to harvest the five-number summary (z$stats).
z <- boxplot(data_ts ~ cycle(data_ts), xlab = 'Quarter', col = 'beige',
             ylab = 'Industrial Production Index', ylim = c(50, 140),
             axes = FALSE)
zdf <- data.frame(z$stats)
rownames(zdf) <- c('Minimum', '25th Percentile', 'Median',
                   '75th Percentile', 'Maximum')
colnames(zdf) <- quarter_levels
zdf %>%
  kable %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = F, position = "left") %>%
  column_spec(1:1, bold = T)
Q1 Q2 Q3 Q4
Minimum 59.50 56.50 63.90 54.10
25th Percentile 69.40 65.50 73.30 63.55
Median 82.30 76.55 85.95 74.45
75th Percentile 112.45 104.65 118.85 103.60
Maximum 131.90 120.50 131.00 122.60

Data Transformation

Differencing

# Seasonal (lag 4) then regular (lag 1) differencing removes both the
# quarterly pattern and the trend.
diffdata <- diff(diff(data_ts, 4))
tibble_diff <- ts_tibbletime(diffdata)
p <- ggplot(data = tibble_diff, mapping = aes(x = time, y = value)) +
  geom_line(color = 'darkblue') +
  theme_gray() +
  # FIX: corrected the misspelled axis label 'Diffrenced Series'.
  labs(title = 'Differenced Series', x = 'Time', y = 'Differenced Series')
fig <- ggplotly(p)
fig

Lambda Parameter

# Estimate the Box-Cox lambda by profile log-likelihood over [-1, 1].
lmbd <- BoxCox.lambda(data_ts, method = c("loglik"), lower = -1, upper = 1)
# FIX: the original wrapped cat() in pander(); cat() prints its arguments
# and returns NULL, so pander() received NULL and contributed nothing.
# cat() alone produces the same visible output.
cat('lambda=', lmbd)

lambda= -0.65

Box-Cox Transformation

# Apply the Box-Cox transform, then the same two-step differencing,
# to stabilise variance and remove trend/seasonality.
trdata <- forecast::BoxCox(data_ts, lmbd)
difftrdata <- diff(diff(trdata, 4))
tibble_difftr <- ts_tibbletime(difftrdata)
p <- ggplot(tibble_difftr, aes(x = time, y = value)) +
  geom_line(color = 'darkblue') +
  theme_gray() +
  ggtitle('Differenced and Transformed Series') +
  xlab('Time') +
  ylab('Differenced and Transformed Series')
fig <- ggplotly(p)
fig

ACF and PACF

# Sample ACF of the stationary (transformed + differenced) series.
fig <- ggplotly(ggAcf(difftrdata) + theme_gray() + labs(title = 'ACF'))
fig
# Sample PACF of the same series; together they suggest candidate orders.
fig <- ggplotly(ggPacf(difftrdata) + theme_gray() + labs(title = 'PACF'))
fig

Fitting

Based on Summary of rules for identifying ARIMA models https://people.duke.edu/~rnau/arimrule.htm

# Candidate SARIMA(p,1,q)(0,1,1)[4] models, all fitted on the
# Box-Cox-transformed scale via the lambda argument.
fit_candidate <- function(ord) {
  Arima(data_ts, order = ord, seasonal = c(0, 1, 1), lambda = lmbd)
}
fit1 <- fit_candidate(c(0, 1, 0))
fit2 <- fit_candidate(c(1, 1, 0))
fit3 <- fit_candidate(c(0, 1, 1))
fit4 <- fit_candidate(c(2, 1, 0))
fit5 <- fit_candidate(c(1, 1, 1))

AIC Criteria - Model Selection

# Collect the AIC of every candidate and rank them (lower is better).
aics <- c(fit1$aic, fit2$aic, fit3$aic, fit4$aic, fit5$aic)
aics
## [1] -610.7737 -611.3176 -610.4027 -613.8653 -611.6431
names <- c('fit1', 'fit2', 'fit3', 'fit4', 'fit5')
aicset <- data.frame(names, aics)
oset <- aicset[order(aics), ]
kable(oset) %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = F, position = 'left') %>%
  row_spec(1:1, bold = T, color = "white", background = "darkblue")
names aics
4 fit4 -613.8653
5 fit5 -611.6431
2 fit2 -611.3176
1 fit1 -610.7737
3 fit3 -610.4027
# Report the winning model (minimum AIC).
on <- which(aics == min(aics))
cat('Minimum AIC =', aics[on], ' fit', on)
## Minimum AIC = -613.8653  fit 4

Auto ARIMA

Auto Arima suggested model

# Let auto.arima search the restricted order space (p,q,P,Q <= 2) with the
# same differencing (d = D = 1) and Box-Cox lambda as the manual fits.
auto.arima(data_ts,
           d = 1, D = 1,
           max.p = 2, max.q = 2,
           max.P = 2, max.Q = 2,
           max.order = 5,
           seasonal = TRUE,
           lambda = lmbd)
## Series: data_ts 
## ARIMA(2,1,0)(0,1,1)[4] 
## Box Cox transformation: lambda= -0.65 
## 
## Coefficients:
##           ar1     ar2     sma1
##       -0.1529  0.2860  -0.7189
## s.e.   0.1276  0.1318   0.1227
## 
## sigma^2 estimated as 1.736e-06:  log likelihood=310.93
## AIC=-613.87   AICc=-613.12   BIC=-605.56

Confirmatory Data Analysis

ACF and PACF Residuals

# Residual autocorrelations of the selected model (fit4); absence of
# significant spikes is consistent with white-noise residuals.
resd <- fit4$residuals
fig <- ggplotly(ggAcf(resd) + theme_gray() + labs(title = 'Residuals ACF'))
fig
fig <- ggplotly(ggPacf(resd) + theme_gray() + labs(title = 'Residuals PACF'))
fig

Residuals - Histogram

# Residuals as a data frame for ggplot.
rdf <- data.frame(resd)

# Histogram of residuals with a kernel-density overlay; the dashed
# vertical line marks the residual mean.
p <- ggplot(rdf, aes(x = resd)) +
  geom_histogram(aes(y = ..density..), fill = "lightgrey",
                 color = "darkblue", size = .2, binwidth = .001) +
  geom_density(color = "red") +
  geom_vline(aes(xintercept = mean(resd, na.rm = T)),
             color = "red", linetype = "dashed", size = .5) +
  theme_gray() +
  labs(x = "", y = "Density") +
  ggtitle("Density with Histogram Overlay")
fig <- ggplotly(p)
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
fig

Q-Q plot, Shapiro test, skewness and kurtosis

# Normal Q-Q plot of the residuals.
p <- ggplot(rdf, aes(sample = resd)) +
  stat_qq(color = 'red', shape = 1) +
  stat_qq_line(color = 'darkblue') +
  theme_gray() +
  ggtitle("QQ Plot")
ggplotly(p)
# Shapiro-Wilk test: a large p-value gives no evidence against normality.
shapiro.test(resd)
## 
##  Shapiro-Wilk normality test
## 
## data:  resd
## W = 0.9942, p-value = 0.9915
sk <- skewness(resd)
kt <- kurtosis(resd)
# sk
# kt
# Compare sample skewness/kurtosis with the normal ideals (0 and 3).
# FIX: corrected the misspelled table label 'Skewnes' -> 'Skewness'.
skdf <- data.frame(Parameter = c('Skewness', 'Kurtosis'),
                   Value = c(sk, kt),
                   Ideal = c(0, 3),
                   Diff = c(0 - sk, 3 - kt))
kable(skdf) %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = F, position = "left")
Parameter Value Ideal Diff
Skewness 0.0698136 0 -0.0698136
Kurtosis 2.6216425 3 0.3783575

Original and Fitted Series

# Build a long-format tibble holding the observed and fitted series
# side by side under a 'series' key.
tibble_original <- ts_tibbletime(fit4$x)
tibble_fitted <- ts_tibbletime(fit4$fitted)

both_series <- tibble_original
both_series$fitted <- tibble_fitted$value
both_series$original <- tibble_original$value
both_series <- gather(both_series, series, val, original:fitted)

print(both_series)
## # A time tibble: 128 x 4
## # Index: time
##    time       value series     val
##    <date>     <dbl> <chr>    <dbl>
##  1 1975-10-01  54.1 original  54.1
##  2 1976-01-01  59.5 original  59.5
##  3 1976-04-01  56.5 original  56.5
##  4 1976-07-01  63.9 original  63.9
##  5 1976-10-01  57.8 original  57.8
##  6 1977-01-01  62   original  62  
##  7 1977-04-01  58.5 original  58.5
##  8 1977-07-01  65   original  65  
##  9 1977-10-01  59.6 original  59.6
## 10 1978-01-01  63.6 original  63.6
## # ... with 118 more rows
# Overlay of observed vs fitted values.
p <- ggplot(both_series, aes(x = time, y = val, color = series)) +
  geom_point() +
  geom_line() +
  theme(legend.position = "top") +
  theme_gray() +
  scale_color_manual(values = c("darkblue", "red")) +
  ggtitle("Original and Fitted Series")
fig <- ggplotly(p)
fig

SARIMA Model Summary

# Full astsa summary (coefficients, t-table, diagnostics) of the selected
# SARIMA(2,1,0)(0,1,1)[4] model on the Box-Cox-transformed series.
astsa::sarima(trdata,
              2, 1, 0,
              P = 0, D = 1, Q = 1, S = 4,
              details = TRUE, Model = TRUE)
## initial  value -6.495164 
## iter   2 value -6.637605
## iter   3 value -6.660639
## iter   4 value -6.676935
## iter   5 value -6.684088
## iter   6 value -6.688198
## iter   7 value -6.690198
## iter   8 value -6.690400
## iter   9 value -6.690404
## iter  10 value -6.690407
## iter  10 value -6.690407
## iter  10 value -6.690407
## final  value -6.690407 
## converged
## initial  value -6.687495 
## iter   2 value -6.688901
## iter   3 value -6.688982
## iter   4 value -6.688983
## iter   4 value -6.688983
## iter   4 value -6.688983
## final  value -6.688983 
## converged

## $fit
## 
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, 
##     Q), period = S), include.mean = !no.constant, transform.pars = trans, fixed = fixed, 
##     optim.control = list(trace = trc, REPORT = 1, reltol = tol))
## 
## Coefficients:
##           ar1     ar2     sma1
##       -0.1529  0.2860  -0.7189
## s.e.   0.1276  0.1318   0.1227
## 
## sigma^2 estimated as 1.476e-06:  log likelihood = 310.93,  aic = -613.87
## 
## $degrees_of_freedom
## [1] 56
## 
## $ttable
##      Estimate     SE t.value p.value
## ar1   -0.1529 0.1276 -1.1990  0.2356
## ar2    0.2860 0.1318  2.1689  0.0343
## sma1  -0.7189 0.1227 -5.8597  0.0000
## 
## $AIC
## [1] -9.901052
## 
## $AICc
## [1] -9.894378
## 
## $BIC
## [1] -9.767018

Forecast

# 12-quarter (3-year) forecast from the selected model, shown as a table
# with 80% and 95% prediction intervals.
fc <- forecast(fit4, h = 12)
fcdf <- as.data.frame(fc)
kable(fcdf) %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = F, position = 'left')
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
1991 Q4 113.0014 108.99967 117.2514 106.97500 119.6091
1992 Q1 122.3868 116.47060 128.8158 113.52713 132.4482
1992 Q2 112.6067 105.80155 120.1655 102.46910 124.5125
1992 Q3 127.9640 118.25563 139.0614 113.59775 145.5954
1992 Q4 115.6914 105.71294 127.3252 100.99155 134.2887
1993 Q1 125.2040 112.44497 140.5413 106.53441 149.9614
1993 Q2 115.1707 102.70919 130.3395 96.98583 139.7568
1993 Q3 131.0358 114.42557 152.0369 106.98941 165.5120
1993 Q4 118.3352 102.71239 138.3068 95.76931 151.2464
1994 Q1 128.1835 109.07697 153.5174 100.78650 170.4761
1994 Q2 117.7817 99.81706 141.7832 92.06032 157.9612
1994 Q3 134.2575 110.89694 167.0153 101.11611 190.1021
# Quarterly date grids: t1 spans the observed history, t2 the forecast horizon.
t1 <- seq(as.Date("1975/10/1"), as.Date("1991/07/1"), by = "quarter")
t2 <- seq(as.Date("1991/10/1"), as.Date("1994/07/1"), by = "quarter")

df1 <- data.frame(time = t1, value = data_csv$Production, series = "observations")
df2 <- data.frame(time = t2, value = fcdf$`Point Forecast`, series = "forecast")
df3 <- data.frame(time = t2, value = fcdf$`Lo 80`, series = "lo80")
df4 <- data.frame(time = t2, value = fcdf$`Hi 80`, series = "hi80")
df5 <- data.frame(time = t2, value = fcdf$`Lo 95`, series = "lo95")
df6 <- data.frame(time = t2, value = fcdf$`Hi 95`, series = "hi95")

# FIX: the ribbons originally pulled df3$value/df5$value vectors straight
# into aes() with no data argument, bypassing ggplot's data handling
# (fragile, breaks with faceting/reordering). Each ribbon now receives an
# explicit data frame.
rib95 <- data.frame(time = t2, lo = df5$value, hi = df6$value)
rib80 <- data.frame(time = t2, lo = df3$value, hi = df4$value)

p <- ggplot() +
  geom_line(data = df1, mapping = aes(x = time, y = value, color = series), size = .4) +
  geom_line(data = df2, mapping = aes(x = time, y = value, color = series), size = .4) +
  geom_point(data = df2, mapping = aes(x = time, y = value), color = 'red', size = .8) +
  geom_ribbon(data = rib95, mapping = aes(x = time, ymin = lo, ymax = hi),
              fill = "grey50", alpha = 0.2) +
  geom_ribbon(data = rib80, mapping = aes(x = time, ymin = lo, ymax = hi),
              fill = "grey70", alpha = 0.2) +
  scale_color_manual(values = c("red", "darkblue")) +
  theme_gray() +
  # FIX: corrected the run-together y-axis label 'Observationsand Forecast'.
  labs(x = "Time", y = "Observations and Forecast") +
  ggtitle("Forecast")
fig <- ggplotly(p)
fig

Time Series Cross-Validation

Using tsCV function we measure the predictive performance of all five models

# Forecast-function factories for tsCV(): tsCV calls each function with the
# training subset `x` ending at the current origin and a horizon `h`.
# FIX: the originals ignored `x` and refit on the FULL series data_ts, so
# every "cross-validated" forecast was produced by a model that had already
# seen the future (data leakage), invalidating the MSE comparison. Each
# function now fits on the subset tsCV() passes in.
far1 <- function(x, h){forecast(Arima(x, order=c(0,1,0), seasonal=c(0,1,1), lambda=lmbd), h)}
far2 <- function(x, h){forecast(Arima(x, order=c(1,1,0), seasonal=c(0,1,1), lambda=lmbd), h)}
far3 <- function(x, h){forecast(Arima(x, order=c(0,1,1), seasonal=c(0,1,1), lambda=lmbd), h)}
far4 <- function(x, h){forecast(Arima(x, order=c(2,1,0), seasonal=c(0,1,1), lambda=lmbd), h)}
far5 <- function(x, h){forecast(Arima(x, order=c(1,1,1), seasonal=c(0,1,1), lambda=lmbd), h)}

# Rolling-origin cross-validation errors and their mean squared error.
e1 <- tsCV(data_ts, far1, h = 12)
avg_e1 <- mean(e1^2, na.rm = TRUE)
e2 <- tsCV(data_ts, far2, h = 12)
avg_e2 <- mean(e2^2, na.rm = TRUE)
e3 <- tsCV(data_ts, far3, h = 12)
avg_e3 <- mean(e3^2, na.rm = TRUE)
e4 <- tsCV(data_ts, far4, h = 12)
avg_e4 <- mean(e4^2, na.rm = TRUE)
e5 <- tsCV(data_ts, far5, h = 12)
avg_e5 <- mean(e5^2, na.rm = TRUE)
  
# Rank the five candidate models by cross-validated mean squared error.
errors <- c(avg_e1, avg_e2, avg_e3, avg_e4, avg_e5)
names <- c('MSE 1', 'MSE 2', 'MSE 3', 'MSE 4', 'MSE 5')
errorset <- data.frame(names, errors)
oerrorset <- errorset[order(errors), ]

kable(oerrorset) %>%
  kable_styling(bootstrap_options = c("responsive"), full_width = F, position = 'left') %>%
  row_spec(1:1, bold = T, color = "white", background = "darkblue")
names errors
2 MSE 2 1359.492
3 MSE 3 1359.795
1 MSE 1 1375.393
4 MSE 4 1472.709
5 MSE 5 1489.297
# Report the model with the smallest cross-validated MSE.
min.mse <- which(errors == min(errors))
cat('Minimum MSE =', errors[min.mse], ' fit ', min.mse)
## Minimum MSE = 1359.492  fit  2

Min MSE Model Summary

# astsa summary of the lowest-MSE model, SARIMA(1,1,0)(0,1,1)[4] (fit2),
# fitted to the Box-Cox-transformed series.
astsa::sarima(trdata,
              1, 1, 0,
              P = 0, D = 1, Q = 1, S = 4,
              details = TRUE, Model = TRUE)
## initial  value -6.503379 
## iter   2 value -6.623008
## iter   3 value -6.627969
## iter   4 value -6.648726
## iter   5 value -6.656746
## iter   6 value -6.659204
## iter   7 value -6.659887
## iter   8 value -6.660151
## iter   9 value -6.660155
## iter   9 value -6.660155
## iter   9 value -6.660155
## final  value -6.660155 
## converged
## initial  value -6.649834 
## iter   2 value -6.650443
## iter   2 value -6.650443
## iter   2 value -6.650443
## final  value -6.650443 
## converged

## $fit
## 
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, 
##     Q), period = S), include.mean = !no.constant, transform.pars = trans, fixed = fixed, 
##     optim.control = list(trace = trc, REPORT = 1, reltol = tol))
## 
## Coefficients:
##           ar1     sma1
##       -0.2079  -0.6620
## s.e.   0.1291   0.1261
## 
## sigma^2 estimated as 1.608e-06:  log likelihood = 308.66,  aic = -611.32
## 
## $degrees_of_freedom
## [1] 57
## 
## $ttable
##      Estimate     SE t.value p.value
## ar1   -0.2079 0.1291 -1.6111  0.1127
## sma1  -0.6620 0.1261 -5.2520  0.0000
## 
## $AIC
## [1] -9.859961
## 
## $AICc
## [1] -9.85668
## 
## $BIC
## [1] -9.759435

Simulation

# Simulate 64 quarters from the fitted SARIMA(2,1,0)(0,1,1)[4] model.
# FIX: the original mislabeled the coefficients. For fit4 (order (2,1,0),
# seasonal (0,1,1)) fit4$coef is c(ar1, ar2, sma1), but the original passed
# coef[1] (ar1) as sma, coef[2] (ar2) as a single ar term, and coef[3]
# (sma1) as a regular MA term the model does not have. The mapping below
# matches the fitted model structure.
simm <- sim_sarima(n = 64,
                   model = list(ar = fit4$coef[1:2],
                                sma = fit4$coef[3],
                                iorder = 1, siorder = 1, nseasons = 4),
                   rand.gen = rnorm)
simm_df <- data.frame(time = t1, value = simm)
p <- ggplot(simm_df, aes(x = time, y = value)) +
  geom_point(color = 'darkblue') +
  geom_line(color = 'darkblue') +
  theme_gray() +
  xlab('Time') +
  ylab('Value') +
  ggtitle("Simulation")
fig <- ggplotly(p)
fig